Analisis Data Kecelakaan UK 2017

In [83]:
#import library
from pyspark import SparkContext
from pyspark.sql import SparkSession
from pyspark.sql import SQLContext
from pyspark.sql.functions import month, year, hour, to_date, col, udf, dayofmonth, dayofyear, to_timestamp,from_unixtime, unix_timestamp
from pyspark.sql.types import DateType
from datetime import datetime

from wordcloud import WordCloud, STOPWORDS, ImageColorGenerator
import matplotlib.pyplot as plt
import numpy as np
import seaborn as sns
%matplotlib inline

import plotly
import plotly.plotly as py
plotly.tools.set_credentials_file(username='akbar0102', api_key='HhbZVjpsfob4yDF1aGpd')
import plotly.graph_objs as go
from plotly.offline import download_plotlyjs, init_notebook_mode, plot, iplot
init_notebook_mode(connected=True)
In [2]:
#buat objek spark
spark = SparkSession.builder.appName('uk_accident').getOrCreate()
In [3]:
#sc = SparkContext()
sqlContext = SQLContext(spark)
In [4]:
#read data csv
acc = spark.read.csv('raw_data/Acc.csv', inferSchema=True, header=True)
veh = spark.read.csv('raw_data/Veh.csv', inferSchema=True, header=True)
cas = spark.read.csv('raw_data/Cas.csv', inferSchema=True, header=True)
In [5]:
#lihat jumlah kolom dan rekord
print('Cas',cas.count(), len(cas.columns))
print('Veh',veh.count(), len(veh.columns))
print('Acc',acc.count(), len(acc.columns))
Cas 170993 16
Veh 238926 23
Acc 129982 32

Accident

Analisis data accident UK 2017

In [7]:
acc.printSchema()
root
 |-- Accident_Index: string (nullable = true)
 |-- Location_Easting_OSGR: integer (nullable = true)
 |-- Location_Northing_OSGR: integer (nullable = true)
 |-- Longitude: string (nullable = true)
 |-- Latitude: string (nullable = true)
 |-- Police_Force: integer (nullable = true)
 |-- Accident_Severity: integer (nullable = true)
 |-- Number_of_Vehicles: integer (nullable = true)
 |-- Number_of_Casualties: integer (nullable = true)
 |-- Date: string (nullable = true)
 |-- Day_of_Week: integer (nullable = true)
 |-- Time: string (nullable = true)
 |-- Local_Authority_(District): integer (nullable = true)
 |-- Local_Authority_(Highway): string (nullable = true)
 |-- 1st_Road_Class: integer (nullable = true)
 |-- 1st_Road_Number: integer (nullable = true)
 |-- Road_Type: integer (nullable = true)
 |-- Speed_limit: integer (nullable = true)
 |-- Junction_Detail: integer (nullable = true)
 |-- Junction_Control: integer (nullable = true)
 |-- 2nd_Road_Class: integer (nullable = true)
 |-- 2nd_Road_Number: integer (nullable = true)
 |-- Pedestrian_Crossing-Human_Control: integer (nullable = true)
 |-- Pedestrian_Crossing-Physical_Facilities: integer (nullable = true)
 |-- Light_Conditions: integer (nullable = true)
 |-- Weather_Conditions: integer (nullable = true)
 |-- Road_Surface_Conditions: integer (nullable = true)
 |-- Special_Conditions_at_Site: integer (nullable = true)
 |-- Carriageway_Hazards: integer (nullable = true)
 |-- Urban_or_Rural_Area: integer (nullable = true)
 |-- Did_Police_Officer_Attend_Scene_of_Accident: integer (nullable = true)
 |-- LSOA_of_Accident_Location: string (nullable = true)

In [6]:
#label district
district = spark.read.csv('raw_data/label_accident/district.csv', inferSchema=True, header=True, sep=';')
#label weather
weather = spark.read.csv('raw_data/label_accident/weather.csv', inferSchema=True, header=True, sep=';')
In [7]:
#register sql
district.registerTempTable('district')
acc.registerTempTable('acc')
weather.registerTempTable('weather')
In [8]:
#jumlah kecelakaan yang paling banyak terjadi untuk setiap distrik
acc.groupBy('Local_Authority_(District)').count().orderBy('count', ascending=False).show()
+--------------------------+-----+
|Local_Authority_(District)|count|
+--------------------------+-----+
|                       300| 2724|
|                       204| 1711|
|                         1| 1687|
|                         9| 1333|
|                       596| 1149|
|                         5| 1121|
|                         8| 1098|
|                        30| 1093|
|                       215| 1093|
|                       926| 1075|
|                       102| 1054|
|                        27| 1039|
|                       200| 1015|
|                        10| 1003|
|                        31|  990|
|                        28|  973|
|                         2|  967|
|                         4|  946|
|                        20|  935|
|                       635|  934|
+--------------------------+-----+
only showing top 20 rows

In [9]:
#jumlah korban kecelakaan setiap district secara keseluruhan
dis_sum = sqlContext.sql("SELECT `Local_Authority_(District)`, label, sum(Number_of_Casualties) as sum_casualties\
                     FROM acc,district \
                     WHERE `Local_Authority_(District)` == code \
                     GROUP BY `Local_Authority_(District)`, label \
                     ORDER BY sum_casualties DESC LIMIT 10")
In [10]:
dis_sum.show()
+--------------------------+------------+--------------+
|Local_Authority_(District)|       label|sum_casualties|
+--------------------------+------------+--------------+
|                       300|  Birmingham|          3664|
|                       204|       Leeds|          2203|
|                         1| Westminster|          1916|
|                       596|    Cornwall|          1679|
|                         9|     Lambeth|          1536|
|                       215|   Sheffield|          1415|
|                       102|  Manchester|          1379|
|                       200|    Bradford|          1369|
|                        30|      Barnet|          1343|
|                       926|Glasgow City|          1330|
+--------------------------+------------+--------------+

In [11]:
dis_sum = dis_sum.toPandas()
In [84]:
data = [go.Bar(
            x=dis_sum['label'],
            y=dis_sum['sum_casualties'],
            text=dis_sum['sum_casualties'],
            textposition='auto'
    )]

py.iplot(data, filename='district-accident')
/home/hadoop/.local/lib/python3.5/site-packages/IPython/core/display.py:689: UserWarning:

Consider using IPython.display.IFrame instead

Out[84]:

Grafik distrik dengan jumlah korban terbanyak, yaitu paling tinggi adalah Birmingham

In [42]:
#jumlah casualties setiap district berdasarkan severity(fatal,slight,serious)
district_cas = sqlContext.sql("SELECT `Local_Authority_(District)`, district.label as district, sev.label as severity, sum(`count(Casualty_Severity)`) as casualty \
                     FROM acc,district,sev \
                     WHERE `Local_Authority_(District)` == code and acc.Accident_Index == sev.Accident_Index \
                     GROUP BY `Local_Authority_(District)`, district.label, sev.label, Casualty_Severity \
                     ORDER BY district.label")
In [43]:
district_cas.show()
+--------------------------+---------------+--------+--------+
|Local_Authority_(District)|       district|severity|casualty|
+--------------------------+---------------+--------+--------+
|                       910|  Aberdeen City|   Fatal|       2|
|                       910|  Aberdeen City|  Slight|     148|
|                       910|  Aberdeen City| Serious|      34|
|                       911|  Aberdeenshire|  Slight|     217|
|                       911|  Aberdeenshire|   Fatal|       7|
|                       911|  Aberdeenshire| Serious|     122|
|                       557|           Adur|  Slight|     150|
|                       557|           Adur| Serious|      32|
|                       557|           Adur|   Fatal|       2|
|                        60|      Allerdale| Serious|      49|
|                        60|      Allerdale|   Fatal|       5|
|                        60|      Allerdale|  Slight|     277|
|                       320|   Amber Valley|  Slight|     195|
|                       320|   Amber Valley| Serious|      41|
|                       320|   Amber Valley|   Fatal|       4|
|                       912|          Angus|   Fatal|      10|
|                       912|          Angus|  Slight|     138|
|                       912|          Angus| Serious|      43|
|                       913|Argyll and Bute| Serious|      54|
|                       913|Argyll and Bute|   Fatal|       4|
+--------------------------+---------------+--------+--------+
only showing top 20 rows

Tabel yang menunjukan jumlah korban berdasarkan tingkat keparahannya untuk setiap distrik

In [44]:
#accident berdasarkan kondisi cuaca
sqlContext.sql("SELECT Weather_Conditions, label, count(Weather_Conditions) \
               FROM acc, weather \
               WHERE acc.Weather_Conditions == weather.code AND acc.Weather_Conditions > 0\
               GROUP BY Weather_Conditions, label \
               ORDER BY Weather_Conditions").show()
+------------------+--------------------+-------------------------+
|Weather_Conditions|               label|count(Weather_Conditions)|
+------------------+--------------------+-------------------------+
|                 1|  Fine no high winds|                   104710|
|                 2|Raining no high w...|                    14300|
|                 3|Snowing no high w...|                      641|
|                 4|   Fine + high winds|                     1241|
|                 5|Raining + high winds|                     1102|
|                 6|Snowing + high winds|                       99|
|                 7|         Fog or mist|                      624|
|                 8|               Other|                     2674|
|                 9|             Unknown|                     4590|
+------------------+--------------------+-------------------------+

In [45]:
#accident berdasarkan kondisi kecahayaan
acc.filter(acc['Light_Conditions'] > 0).groupBy('Light_Conditions').count().orderBy('Light_Conditions').show()
+----------------+-----+
|Light_Conditions|count|
+----------------+-----+
|               1|92678|
|               4|26847|
|               5|  929|
|               6| 6678|
|               7| 2849|
+----------------+-----+

In [46]:
#accident berdasarkan kondisi jalan
acc.filter(acc['Road_Surface_Conditions'] > 0).groupBy('Road_Surface_Conditions').count().orderBy('Road_Surface_Conditions').show()
+-----------------------+-----+
|Road_Surface_Conditions|count|
+-----------------------+-----+
|                      1|92628|
|                      2|32564|
|                      3|  428|
|                      4| 2319|
|                      5|  106|
+-----------------------+-----+

In [47]:
#untuk analisis tanggal, sebelumnya tipe data string -> Date
func =  udf (lambda x: datetime.strptime(x, '%d/%m/%Y'), DateType())
df = acc.withColumn('new_date', func(col('Date')))
In [48]:
#tambah kolom baru, diambil dari date yaitu untuk kolom bulan dan tanggal
df = df.withColumn('month', month(df['new_date']))
df = df.withColumn('day', dayofmonth(df['new_date']))
In [49]:
#kecelakaan setiap bulan dalam tahun 2017
acc_mon = df.groupBy('month').count().orderBy('month')
In [50]:
acc_mon = acc_mon.toPandas()
In [85]:
data = [go.Bar(
            x=['January', 'February', 'March', 'April', 'May', 'June', 'July', 'August', 'September', 'October', 'November', 'December'],
            y=acc_mon['count'],
            text=acc_mon['count'],
            textposition='auto'
    )]

py.iplot(data, filename='accident-month')
Out[85]:

Grafik kecelakaan yang terjadi dalam setahun per bulan

In [52]:
#kecelakaan berdasarkan nama hari dalam seminggu setiap harinya selama tahun 2017
dow = acc.groupBy('Day_of_Week').count().orderBy('Day_of_Week')
In [53]:
#ubah ke pandas
dow = dow.toPandas()
In [86]:
data = [go.Bar(
            x=['Sunday', 'Monday', 'Tuesday', 'Wednesday', 'Thursday', 'Friday', 'Saturday'],
            y=dow['count'],
            text=dow['count'],
            textposition='auto'
    )]

py.iplot(data, filename='day-of-week')
Out[86]:

Grafik jumlah kecelakaan yang terjadi dalam setahun untuk setiap harinya

In [55]:
#ambil data kecelakaan berdasarkan waktu untuk setiap tanggal dalam sebulan, apabila hendak berdasarkan bulan keberapa menggunakan filter
#tes = df.filter((df['month'] == 1)).groupBy('Time', 'day', 'month').count().orderBy('month','day', 'Time')
tes = df.groupBy('Time', 'day', 'month').count().orderBy('month','day', 'Time')
In [56]:
#kecelakaan setiap waktu pada setiap tanggal dan setiap bulan dalam tahun 2017
tes.show()
+-----+---+-----+-----+
| Time|day|month|count|
+-----+---+-----+-----+
|00:05|  1|    1|    1|
|00:10|  1|    1|    1|
|00:20|  1|    1|    1|
|00:22|  1|    1|    1|
|00:25|  1|    1|    1|
|00:26|  1|    1|    1|
|00:28|  1|    1|    1|
|00:30|  1|    1|    2|
|00:40|  1|    1|    1|
|00:45|  1|    1|    1|
|01:00|  1|    1|    6|
|01:11|  1|    1|    1|
|01:15|  1|    1|    1|
|01:20|  1|    1|    3|
|01:23|  1|    1|    1|
|01:26|  1|    1|    1|
|01:28|  1|    1|    1|
|01:29|  1|    1|    1|
|01:30|  1|    1|    2|
|01:35|  1|    1|    3|
+-----+---+-----+-----+
only showing top 20 rows

In [57]:
#kecelakaan dalam 24 jam selama tahun 2017
tim = tes.filter((tes['Time'] != 'null') & (tes['Time'] != 'NULL')).groupBy(hour('Time').alias('hour')).sum('count').orderBy('hour')
In [58]:
tim = tim.toPandas()
In [59]:
tim.head()
Out[59]:
hour sum(count)
0 0 1949
1 1 1352
2 2 1008
3 3 887
4 4 736
In [89]:
trace = go.Bar(
            x=tim['hour'],
            y=tim['sum(count)'],
            text=tim['sum(count)'],
            textposition='auto'
    )

data = [trace]

layout = go.Layout(
    title='Accident Hourly 2017',
    xaxis=dict(tickmode='linear')
)

fig = go.Figure(data=data, layout=layout)

py.iplot(fig, filename='hour-accident')
Out[89]:

Grafik kecelakaan yang terjadi selama setahun per waktu terjadinya

In [61]:
#kecelakaan pada setiap tanggal dalam setiap bulan selama tahun 2017
#coba untuk bulan 1 dalam 30 hari
tgl = tes.filter(tes['month'] == 1).groupBy('day', 'month').sum('count').orderBy('day')
In [62]:
tgl = tgl.toPandas()
In [88]:
trace = go.Bar(
            x=tgl['day'],
            y=tgl['sum(count)'],
            text=tgl['sum(count)'],
            textposition='auto'
    )


data = [trace]

layout = go.Layout(
    title='Accident Monthly - January 2017',
    xaxis=dict(tickmode='linear')
)

fig = go.Figure(data=data, layout=layout)

py.iplot(fig, filename='monthly-accident')
Out[88]:

Grafik kecelakaan yang terjadi pada tahun 2017 di bulan januari selama satu bulan

In [64]:
#filter lat dan lon untuk melihat kecelakaan yang terjadi pada jalan Motorways dan A(M)
maps = acc.filter(
    (acc['Latitude']!='null') & (acc['Latitude']!='NULL') &
    (acc['Longitude']!='null') & (acc['Longitude']!='NULL')).groupBy('Latitude','Longitude','Local_Authority_(District)','Accident_Severity').count().orderBy('Latitude','Longitude')
In [65]:
maps = maps.toPandas()
In [66]:
maps.head()
Out[66]:
Latitude Longitude Local_Authority_(District) Accident_Severity count
0 49.929558 -6.294338 596 2 1
1 49.995028 -5.210809 596 2 1
2 50.003174 -5.214534 596 3 1
3 50.041051 -5.101821 596 3 1
4 50.048617 -5.223844 596 2 1
In [90]:
data = [ dict(
        type = 'scattermapbox',
        lon = maps['Longitude'],
        lat = maps['Latitude'],
        locationmode='United Kingdom',
        marker = dict(
            size = 2,
            cmin = maps['Accident_Severity'].min(),
            color = maps['Accident_Severity'],
            cmax = maps['Accident_Severity'].max(),
            colorbar=dict(
                title="Accident_Severity"
            ),
            colorscale= [[0, "rgb(0,255,0)"], [0.5, "rgb(255,128,0)"], [1, "rgb(255,0,2)"]]
        ),
        text = maps['Local_Authority_(District)']
        )]

layout = dict(
        height = 900,
        width = 900,
        title = 'UK Accident 2017',
        colorbar = True,
        mapbox= dict(
            accesstoken="pk.eyJ1IjoiYWtiYXIwMTAyIiwiYSI6ImNqcXFtc2c0MTBjaWIzeG8xb25xbmhid3oifQ.0Xv30-rNVlvIzDy0mZoROg",
            center= dict(
              lat= 54, 
              lon= -4
            ), 
            zoom= 4.7
        )
    )

fig = dict( data=data, layout=layout ,mapboxAccessToken= 'HhbZVjpsfob4yDF1aGpd' )
py.iplot( fig, validate=False, filename='uk-accident' )
/home/hadoop/.local/lib/python3.5/site-packages/plotly/plotly/plotly.py:230: UserWarning:

Woah there! Look at all those points! Due to browser limitations, the Plotly SVG drawing functions have a hard time graphing more than 500k data points for line charts, or 40k points for other types of charts. Here are some suggestions:
(1) Use the `plotly.graph_objs.Scattergl` trace object to generate a WebGl graph.
(2) Trying using the image API to return an image instead of a graph URL
(3) Use matplotlib
(4) See if you can create your visualization with fewer data points

If the visualization you're using aggregates points (e.g., box plot, histogram, etc.) you can disregard this warning.

Out[90]:

Map berdasarkan keparahan kecelakaan

In [78]:
cek = sqlContext.sql("SELECT `Local_Authority_(District)`, label, Latitude, Longitude, 1st_Road_Class \
                     FROM acc, district \
                     WHERE `Local_Authority_(District)` == code AND 1st_Road_Class == 1 OR 1st_Road_Class == 2 \
                     AND Latitude != 'null' AND 'Latitude'!='NULL' AND Longitude != 'null' AND 'Longitude'!='NULL' \
                     GROUP BY `Local_Authority_(District)`, label, Latitude, Longitude, 1st_Road_Class \
                     ORDER BY label")
In [81]:
cek= cek.toPandas()
In [82]:
data = [ dict(
        type = 'scattermapbox',
        lon = cek['Longitude'],
        lat = cek['Latitude'],
        locationmode='United Kingdom',
        marker = dict(
            size = 2,
            cmin = 0,
            color = cek['1st_Road_Class'],
            cmax = cek['1st_Road_Class'].max(),
            colorbar=dict(
                title="1st_Road_Class"
            ),
            colorscale= [[0, "rgb(0,255,0)"], [0.5, "rgb(255,128,0)"], [1, "rgb(255,0,2)"]]
        ),
        text = cek['label']
        )]

layout = dict(
        height = 900,
        width = 900,
        title = 'UK Accident 2017',
        colorbar = True,
        mapbox= dict(
            accesstoken="pk.eyJ1IjoiYWtiYXIwMTAyIiwiYSI6ImNqcXFtc2c0MTBjaWIzeG8xb25xbmhid3oifQ.0Xv30-rNVlvIzDy0mZoROg",
            center= dict(
              lat= 54, 
              lon= -4
            ), 
            zoom= 4.7
        )
    )

fig = dict( data=data, layout=layout ,mapboxAccessToken= 'HhbZVjpsfob4yDF1aGpd' )
iplot( fig, validate=False, filename='uk-accident-district')

Map berdasarkan kecelakaan yang terjadi pada 1st_Road Motorways dan A(M)

In [71]:
#jumlah kendaraan yang terlibat pada setiap kategori kecelakaan
acc.groupBy('Accident_Severity').sum('Number_of_Vehicles').show()
+-----------------+-----------------------+
|Accident_Severity|sum(Number_of_Vehicles)|
+-----------------+-----------------------+
|                1|                   2963|
|                3|                 197158|
|                2|                  38805|
+-----------------+-----------------------+

In [72]:
#jumlah kecelakaan berdasarkan keparahannya
acc.groupBy('Accident_Severity').count().show()
+-----------------+------+
|Accident_Severity| count|
+-----------------+------+
|                1|  1676|
|                3|105772|
|                2| 22534|
+-----------------+------+

In [ ]:
 
In [ ]:
 
In [ ]:
 

Casualty

Analisis data casualty UK 2017

In [101]:
cas.printSchema()
root
 |-- Accident_Index: string (nullable = true)
 |-- Vehicle_Reference: integer (nullable = true)
 |-- Casualty_Reference: integer (nullable = true)
 |-- Casualty_Class: integer (nullable = true)
 |-- Sex_of_Casualty: integer (nullable = true)
 |-- Age_of_Casualty: integer (nullable = true)
 |-- Age_Band_of_Casualty: integer (nullable = true)
 |-- Casualty_Severity: integer (nullable = true)
 |-- Pedestrian_Location: integer (nullable = true)
 |-- Pedestrian_Movement: integer (nullable = true)
 |-- Car_Passenger: integer (nullable = true)
 |-- Bus_or_Coach_Passenger: integer (nullable = true)
 |-- Pedestrian_Road_Maintenance_Worker: integer (nullable = true)
 |-- Casualty_Type: integer (nullable = true)
 |-- Casualty_Home_Area_Type: integer (nullable = true)
 |-- Casualty_IMD_Decile: integer (nullable = true)

In [14]:
#read referensi label
age_band = spark.read.csv('raw_data/label_casualty/age_band.csv', inferSchema=True, header=True, sep=';')
cas_severity = spark.read.csv('raw_data/label_casualty/cas_severity.csv', inferSchema=True, header=True, sep=';')
In [15]:
#register tabel untuk sql
cas.registerTempTable('cas')
age_band.registerTempTable('age_band')
cas_severity.registerTempTable('cas_severity')
In [16]:
#sql jumlah casualty berdasrkan rentang usia
res = sqlContext.sql("SELECT Age_Band_of_Casualty, label, count(Age_Band_of_Casualty) \
                     FROM cas,age_band \
                     WHERE Age_Band_of_Casualty == code \
                     GROUP BY Age_Band_of_Casualty, label")
In [17]:
res.orderBy('Age_Band_of_Casualty').show()
+--------------------+-------+---------------------------+
|Age_Band_of_Casualty|  label|count(Age_Band_of_Casualty)|
+--------------------+-------+---------------------------+
|                   1|  0 - 5|                       3280|
|                   2| 6 - 10|                       4823|
|                   3|11 - 15|                       7618|
|                   4|16 - 20|                      18390|
|                   5|21 - 25|                      20933|
|                   6|26 - 35|                      35171|
|                   7|36 - 45|                      25530|
|                   8|46 - 55|                      23802|
|                   9|56 - 65|                      14124|
|                  10|66 - 75|                       8291|
|                  11|Over 75|                       6817|
+--------------------+-------+---------------------------+

In [18]:
# jumlah casualty severity berdasarkan index kecelakaan, untuk setiap keparahan kecelakaan (slight, serious, fatal)
sev = sqlContext.sql("SELECT Accident_Index ,Casualty_Severity, label, count(Casualty_Severity) \
                FROM cas, cas_severity \
                WHERE Casualty_Severity == code \
                GROUP BY Casualty_Severity, label, Accident_Index \
                ORDER BY Accident_Index, Casualty_Severity")
In [19]:
#register tabel sql
sev.registerTempTable('sev')
In [20]:
sev.show()
+--------------+-----------------+-------+------------------------+
|Accident_Index|Casualty_Severity|  label|count(Casualty_Severity)|
+--------------+-----------------+-------+------------------------+
| 2017010001708|                1|  Fatal|                       1|
| 2017010001708|                2|Serious|                       1|
| 2017010001708|                3| Slight|                       1|
| 2017010009342|                3| Slight|                       1|
| 2017010009344|                3| Slight|                       1|
| 2017010009348|                3| Slight|                       1|
| 2017010009350|                2|Serious|                       1|
| 2017010009351|                3| Slight|                       1|
| 2017010009353|                3| Slight|                       2|
| 2017010009354|                3| Slight|                       1|
| 2017010009357|                2|Serious|                       1|
| 2017010009358|                2|Serious|                       1|
| 2017010009360|                3| Slight|                       4|
| 2017010009362|                3| Slight|                       1|
| 2017010009363|                3| Slight|                       3|
| 2017010009364|                2|Serious|                       1|
| 2017010009365|                3| Slight|                       1|
| 2017010009366|                3| Slight|                       1|
| 2017010009375|                3| Slight|                       1|
| 2017010009376|                3| Slight|                       1|
+--------------+-----------------+-------+------------------------+
only showing top 20 rows

In [ ]:
 
In [21]:
#jumlah casualty berdasarkan rentang usia tanpa label, hanya kode yang merepresentasikan rentang usia
cas.filter(cas['Age_Band_of_Casualty'] > 0).groupBy('Age_Band_of_Casualty').count().orderBy('Age_Band_of_Casualty').show()
+--------------------+-----+
|Age_Band_of_Casualty|count|
+--------------------+-----+
|                   1| 3280|
|                   2| 4823|
|                   3| 7618|
|                   4|18390|
|                   5|20933|
|                   6|35171|
|                   7|25530|
|                   8|23802|
|                   9|14124|
|                  10| 8291|
|                  11| 6817|
+--------------------+-----+

In [22]:
#jumlah casualty berdasarkan jenis kelamin
jk = cas.filter(cas['Sex_of_Casualty'] > 0).groupBy('Sex_of_Casualty').count()
In [23]:
pdf1 = jk.toPandas()
#pdf1.plot(kind='bar',x='Sex_of_Casualty',y='count')
#pdf1
In [24]:
pdf1['count'][0]
Out[24]:
101379
In [25]:
data = [go.Bar(
            x=['Male','Female'],
            y=pdf1['count']
    )]

iplot(data, filename='sex-casuality')

Grafik total secara keseluruhan jenis kelamin pria dan wanita yang menjadi korban pada saat kecelakaan

In [26]:
#jumlah casualty berdasarkan jenis kelamin, untuk setiap rentang usia
male = cas.filter((cas['Sex_of_Casualty'] > 0) & (cas['Age_Band_of_Casualty'] > 0) & (cas['Sex_of_Casualty'] == 1)).groupBy('Sex_of_Casualty', 'Age_Band_of_Casualty').count().orderBy('Sex_of_Casualty', 'Age_Band_of_Casualty')
In [27]:
#jumlah casualty berdasarkan jenis kelamin, untuk setiap rentang usia
female = cas.filter((cas['Sex_of_Casualty'] > 0) & (cas['Age_Band_of_Casualty'] > 0) & (cas['Sex_of_Casualty'] == 2)).groupBy('Sex_of_Casualty', 'Age_Band_of_Casualty').count().orderBy('Sex_of_Casualty', 'Age_Band_of_Casualty')
In [28]:
male = male.toPandas()
female = female.toPandas()
In [29]:
trace1 = go.Bar(
    x=['0-5', '6-10', '11-15', '16-20', '21-25','26-35','36-45','46-55','56-65','66-75','Over 75'],
    y=male['count'],
    name='Male'
)
trace2 = go.Bar(
    x=['0-5', '6-10', '11-15', '16-20', '21-25','26-35','36-45','46-55','56-65','66-75','Over 75'],
    y=female['count'],
    name='Female'
)

data = [trace1, trace2]
layout = go.Layout(
    barmode='stack'
)

fig = go.Figure(data=data, layout=layout)
iplot(fig, filename='stacked-bar')

Grafik rentang usia yang menjadi korban kecelakaan berdasarkan jenis kelaminnya, dari grafik dapat dilihat yang paling banyak adalah rentang usia 26-35 tahun dan korban terbanyak untuk setiap rentang usianya berjenis kelamin laki-laki.

In [30]:
cas.groupBy('Casualty_Type','Casualty_IMD_Decile').count().orderBy('Casualty_Type', 'Casualty_IMD_Decile').show()
+-------------+-------------------+-----+
|Casualty_Type|Casualty_IMD_Decile|count|
+-------------+-------------------+-----+
|           -1|                  3|    1|
|           -1|                  5|    2|
|            0|                 -1| 7161|
|            0|                  1| 2540|
|            0|                  2| 2620|
|            0|                  3| 2324|
|            0|                  4| 1953|
|            0|                  5| 1636|
|            0|                  6| 1447|
|            0|                  7| 1174|
|            0|                  8| 1130|
|            0|                  9| 1010|
|            0|                 10|  810|
|            1|                 -1| 4710|
|            1|                  1| 1383|
|            1|                  2| 1622|
|            1|                  3| 1656|
|            1|                  4| 1537|
|            1|                  5| 1388|
|            1|                  6| 1261|
+-------------+-------------------+-----+
only showing top 20 rows

In [31]:
cas.groupBy('Casualty_Severity').count().show()
+-----------------+------+
|Casualty_Severity| count|
+-----------------+------+
|                1|  1793|
|                3|144369|
|                2| 24831|
+-----------------+------+

In [ ]:
 
In [ ]:
 
In [ ]:
 

Vehicle

Analisis data vehicle UK 2017

In [103]:
veh.printSchema()
root
 |-- Accident_Index: string (nullable = true)
 |-- Vehicle_Reference: integer (nullable = true)
 |-- Vehicle_Type: integer (nullable = true)
 |-- Towing_and_Articulation: integer (nullable = true)
 |-- Vehicle_Manoeuvre: integer (nullable = true)
 |-- Vehicle_Location-Restricted_Lane: integer (nullable = true)
 |-- Junction_Location: integer (nullable = true)
 |-- Skidding_and_Overturning: integer (nullable = true)
 |-- Hit_Object_in_Carriageway: integer (nullable = true)
 |-- Vehicle_Leaving_Carriageway: integer (nullable = true)
 |-- Hit_Object_off_Carriageway: integer (nullable = true)
 |-- 1st_Point_of_Impact: integer (nullable = true)
 |-- Was_Vehicle_Left_Hand_Drive?: integer (nullable = true)
 |-- Journey_Purpose_of_Driver: integer (nullable = true)
 |-- Sex_of_Driver: integer (nullable = true)
 |-- Age_of_Driver: integer (nullable = true)
 |-- Age_Band_of_Driver: integer (nullable = true)
 |-- Engine_Capacity_(CC): integer (nullable = true)
 |-- Propulsion_Code: integer (nullable = true)
 |-- Age_of_Vehicle: integer (nullable = true)
 |-- Driver_IMD_Decile: integer (nullable = true)
 |-- Driver_Home_Area_Type: integer (nullable = true)
 |-- Vehicle_IMD_Decile: integer (nullable = true)

In [32]:
#read referensi label
veh_type = spark.read.csv('raw_data/label_vehicle/veh_type.csv', inferSchema=True, header=True, sep=';')
In [33]:
#register tabel sql
veh.registerTempTable('veh')
veh_type.registerTempTable('veh_type')
In [34]:
#kendaraan yang paling banyak terjadi kecelakaan
veh_type = sqlContext.sql("SELECT Vehicle_Type, label, count(Vehicle_Type) as number_of_accident \
                FROM veh, veh_type \
                WHERE Vehicle_Type == code \
                GROUP BY Vehicle_Type, label \
                ORDER BY number_of_accident DESC")
In [35]:
veh_type = veh_type.toPandas()
In [36]:
veh_type.head()
Out[36]:
Vehicle_Type label number_of_accident
0 9 Car 168347
1 1 Pedal cycle 18954
2 19 Van / Goods 3.5 tonnes mgw or under 12479
3 3 Motorcycle 125cc and under 8464
4 5 Motorcycle over 500cc 6062
In [37]:
data = [go.Bar(
            x=veh_type['label'],
            y=veh_type['number_of_accident']
    )]

iplot(data, filename='veh-type')

Grafik kendaraan yang paling banyak terjadi kecelakaan, yaitu terjadi paling banyak pada mobil, kemudian sepeda yang kemungkinan merupakan korban kecelakaan

In [38]:
#jenis kelamin pengemudi yang kecelakaan
sod = veh.filter(veh['Sex_of_Driver'] > 0).groupBy('Sex_of_Driver').count()
In [39]:
sod = sod.toPandas()
In [40]:
sod
Out[40]:
Sex_of_Driver count
0 1 153964
1 3 19832
2 2 65127
In [41]:
data = [go.Bar(
            x=['Male', 'Female', 'Unknown'],
            y=sod['count'],
            name=""
    )]

iplot(data, filename='sex-of-driver')

Grafik jenis kelamin dari supir yang membawa kendaraan yang terjadi kecelakaan, lebih banyak pria daripada wanita

In [ ]: